Here, I analyse and document my progress with the analysis of a world-wide whole genome sampling of Zymoseptoria tritici. Some of the analysis are just exploratory while some other are lined up in a clear path to answering questions related to the history of Z.tritici and to better understand its adaptation to various climates.
library(knitr)
library(reticulate)
#Spatial analyses packages
library(raster)
library(sp)
library(rgdal)
library(maps)
library(geosphere)
#Data wrangling and data viz
library(tidyverse)
library(purrr)
library(RColorBrewer)
library(plotly)
library(cowplot)
library(GGally)
library(corrplot)
library(pheatmap)
library(ggstance)
library('pophelper')
library(ggbiplot)
library(igraph)
library(ggraph)
library(ggtext)
library("UpSetR")
library(scatterpie)
#Pop structure and pop genomic
library(GenomicFeatures)
library(gridExtra)
library(ggtree)
library(tidytree)
library(multcomp)
library(lsmeans)
#GEA
library(lfmm)
library(GWASTools)
library(topGO)
#Statistics
library(car)
library(corrr)
library(lsmeans)
library(multcomp)
#Variables
world <- map_data("world")
project_dir="/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/"
lists_dir = "/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/WW_PopGen/Keep_lists_samples/"
#Data directories
data_dir=paste0(project_dir, "0_Data/")
metadata_dir=paste0(project_dir, "Metadata/")
fig_dir = "/Users/feurtey/Documents/Postdoc_Bruce/Manuscripts/Feurtey_WW_Zt/Draft_figures/"
#Analysis directories
#-___________________
VAR_dir = paste0(project_dir, "1_Variant_calling/")
depth_per_window_dir = paste0(VAR_dir, "1_Depth_per_window/")
vcf_dir = paste0(VAR_dir, "4_Joint_calling/")
mito_SV = paste0(VAR_dir, "6_Mito_SV/")
PopStr_dir = paste0(project_dir, "2_Population_structure/")
nuc_PS_dir=paste0(PopStr_dir, "0_Nuclear_genome/")
mito_PS_dir = paste0(PopStr_dir, "1_Mitochondrial_genome/")
Sumstats_dir = paste0(project_dir, "3_Sumstats_demography/")
TE_RIP_dir=paste0(project_dir, "4_TE_RIP/")
RIP_DIR=paste0(TE_RIP_dir, "0_RIP_estimation/")
DIM2_DIR=paste0(TE_RIP_dir, "1_Blast_from_denovo_assemblies/")
GEA_dir=paste0(project_dir, "5_GEA/")
fung_dir=paste0(project_dir, "6_Fungicide_resistance/")
virulence_dir = paste0(project_dir, "7_Virulence/")
sel_dir = paste0(project_dir, "8_Selection/")
gene_list_dir = paste0(sel_dir, "0_Lists_unique_copy/")
#Files
vcf_name="Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp"
vcf_name_nomaf="Ztritici_global_March2021.filtered-clean.AB_filtered.SNP.max-m-0.8.thin-1000bp"
vcf_name_mito = "Ztritici_global_March2021.genotyped.mt.filtered.clean.AB_filtered.variants.good_samples.max-m-80"
Zt_list = paste0(lists_dir, "Ztritici_global_March2021.genotyped.good_samples.args")
gff_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.gff3")
effectors_annotation_file = "/Users/feurtey/Documents/Postdoc_Eva/Manuscripts/Accepted/Alice_Cecile_Comparative_genomics/Data_for_publication/Annotations_2018_genomes_for_publication.tab"
eggnog_annotation = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.eggnog")
ref_fasta_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa")
metadata_name = "Main_table_from_SQL_Feb_2020"
gene_annotation = read_tsv(paste0(data_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))
complete_mito = read_tsv(paste0(data_dir, "Complete_mitochondria_from_blast.txt"), col_names = c("ID_file", "Contig"))
Sys.setenv(PROJECTDIR=project_dir)
Sys.setenv(VARDIR=VAR_dir)
Sys.setenv(VCFDIR=vcf_dir)
Sys.setenv(POPSTR=PopStr_dir)
Sys.setenv(MITOPOPSTR=mito_PS_dir)
Sys.setenv(SUMST=Sumstats_dir)
Sys.setenv(GEADIR=GEA_dir)
Sys.setenv(ZTLIST=Zt_list)
Sys.setenv(GFFFILE = gff_file)
Sys.setenv(REFFILE = ref_fasta_file)
Sys.setenv(VCFNAME=vcf_name)
Sys.setenv(VCFNAME_NOMAF=vcf_name_nomaf)
Sys.setenv(VCFNAME_MITO=vcf_name_mito)
#knitr::opts_chunk$set(echo = F)
knitr::opts_chunk$set(message = F)
knitr::opts_chunk$set(warning = F)
knitr::opts_chunk$set(results = T)
# Metadata and sample lists
##Filtered_samples
filtered_samples = bind_rows(
read_tsv(paste0(metadata_dir, "Sample_removed_based_on_IBS.args"), col_names = "ID_file") %>%
mutate(Filter = "IBS"),
read_tsv(paste0(metadata_dir, "Sample_with_too_much_NA.args"), col_names = "ID_file") %>%
mutate(Filter = "High_NA"),
read_tsv(paste0(metadata_dir, "Samples_to_filter_out.args"), col_names = "ID_file") %>%
mutate(Filter = "Mutants_etc"))
##Samples in vcf
genotyped_samples = read_tsv(Zt_list, col_names = "ID_file")
## Metadata of genotyped samples
temp = read_tsv(paste0(metadata_dir, metadata_name, "_Description.tab"), col_names = F) %>% pull()
Zt_meta = read_tsv(paste0(metadata_dir, metadata_name, "_with_collection.tab"),
col_names = temp,
na = "\\N", guess_max = 2000) %>%
unite(Coordinates, Latitude, Longitude, sep = ";", remove = F) %>%
inner_join(., genotyped_samples) %>%
mutate(Country = ifelse(Country == "USA", paste(Country, Region, sep = "_"), Country)) %>%
mutate(Country = ifelse(Country == "Australia", paste(Country, Region, sep = "_"), Country)) %>%
mutate(Country = ifelse(Country == "NZ", "New Zealand", Country)) %>%
mutate(Country = ifelse(Country == "CH", "Switzerland", Country)) %>%
mutate(Latitude2 = round(Latitude, 2), Longitude2 = round(Longitude, 2))
#genotyped_samples %>%
# filter(!(ID_file %in% filtered_samples$ID_file)) %>%
# write_tsv(Zt_list, col_names = F)
#Define colors
## For continents
#myColors <- c("#04078B", "#a10208", "#FFBA08", "#CC0044", "#5C9D06", "#129EBA","#305D1B")
myColors <- c("#DA4167", "grey", "#ffba0a", "#A20106", "#3F6E0C", "#129eba", "#8fa253" )
names(myColors) = levels(factor(Zt_meta$Continent))
Color_Continent = ggplot2::scale_colour_manual(name = "Continent", values = myColors)
Fill_Continent = ggplot2::scale_fill_manual(name = "Continent", values = myColors)
## For clustering
K_colors = c("#f9c74f", "#f9844a", "#90be6d", "#f5cac3",
"#83c5be", "#f28482", "#577590", "#e5e5e5", "#a09abc", "#52796f",
"#219ebc", "#003049", "#82c0cc", "#283618", "white")
## For correlations
mycolorsCorrel<- colorRampPalette(c("#0f8b8d", "white", "#a8201a"))(20)
#Random gradients
greens=c("#1B512D", "#669046", "#8CB053", "#B1CF5F", "#514F59")
blues=c("#08386E", "#1C89C9", "#28A7C0", "#B0DFE8", "grey")
#Bioclim data (from Worldclim)
Bioclim_var = c("Annual Mean Temperature", "Mean Diurnal Range ",
"Isothermality (BIO2/BIO7) (×100)", "Temperature Seasonality (standard deviation ×100)",
"Max Temperature of Warmest Month", "Min Temperature of Coldest Month",
"Temperature Annual Range (BIO5-BIO6)",
"Mean Temperature of Wettest Quarter","Mean Temperature of Driest Quarter",
"Mean Temperature of Warmest Quarter", "Mean Temperature of Coldest Quarter",
"Annual Precipitation", "Precipitation of Wettest Month",
"Precipitation of Driest Month", "Precipitation Seasonality (Coefficient of Variation)",
"Precipitation of Wettest Quarter", "Precipitation of Driest Quarter",
"Precipitation of Warmest Quarter","Precipitation of Coldest Quarter")
pheno_cat = tibble(Phenotype = c(
"Max Temperature of Warmest Month", "Mean Temperature of Warmest Quarter",
"Min Temperature of Coldest Month", "Mean Temperature of Coldest Quarter",
"Precipitation of Driest Month", "Precipitation of Driest Quarter",
"Precipitation of Wettest Month", "Precipitation of Wettest Quarter"),
Category = c("Warmth", "Warmth", "Cold", "Cold",
"Drought", "Drought", "Rain", "Rain"))
pheno_table = tibble(BIO_ID = paste0("BIO", c(1:19)),
Phenotype = Bioclim_var,
Category = c("Temperature", "Temperature range", "Temperature range", "Temperature range",
"Temperature", "Temperature", "Temperature range", "Temperature/rain",
"Temperature/rain", "Temperature", "Temperature", "Precipitation",
"Precipitation", "Precipitation", "Precipitation range", "Precipitation",
"Precipitation", "Temperature/rain", "Temperature/rain"))
#Bioclim_var[c(2, 3, 7)] = "Ranges bioclimatic variables"
#Bioclim_var[c(4, 15)] = "Seasonality bioclimatic variables"
#Bioclim_var[c(1, 12) = "Annual rain and temperature"
#Bioclim_var[c(8, 9, 17, 18)] = "Mix rain and temperature"
I will use the sampling site of each isolate (as precisely as I can manage) to approximate environmental parameters such as temperature, or precipitation. One possibility to find such data is this website, https://www.worldclim.org/data/worldclim21.html (published in Fick and Hijmans, 2017), which gives access to climate data for 1970-2018. These can be transformed into bioclimatic variables using the biovars function from the R package dismo (https://rdrr.io/cran/dismo/man/biovars.html).
temp = Zt_meta %>%
#filter(!(ID_file %in% filtered_samples$ID_file)) %>%
filter(!is.na(Longitude)) %>%
mutate(X = as.numeric(unlist(Longitude)),
Y = as.numeric(unlist(Latitude))) %>%
dplyr::select(X, Y) %>%
distinct()
sp = SpatialPoints(temp[, c("X", "Y")])
summary(sp)
## Object of class SpatialPoints
## Coordinates:
## min max
## X -122.9300 175.6576
## Y -46.2187 59.3294
## Is projected: NA
## proj4string : [NA]
## Number of points: 292
bio_list = list()
for (i in c(1:length(Bioclim_var))) {
file_name=paste0(data_dir,"Climatic_data/wc2.1_10m_bio/wc2.1_10m_bio_", i, ".tif")
rast_temp = raster(file_name)
bio_list[[Bioclim_var[i]]] = raster::extract(rast_temp, sp)
}
climate_per_coordinates = cbind(temp, do.call(cbind, bio_list))
dat = Zt_meta %>%
#filter(!(ID_file %in% filtered_samples$ID_file)) %>%
dplyr::select(ID_file, Latitude, Longitude, Continent) %>%
full_join(., climate_per_coordinates,
by= c("Longitude" = "X", "Latitude" = "Y")) %>%
dplyr::select(-ID_file) %>%
gather(key = "Bioclim_var", value = "Estimate", -c(Longitude, Latitude, Continent))
#Define stable colors
#myColors = c("#ec9a29", "#0f8b8d", "#143642")
#names(myColors) = levels(factor(dat$Continent))
#colScale = scale_colour_manual(name = "Continent", values = myColors)
#colScaleFill = scale_fill_manual(name = "Continent", values = myColors)
ggplot(dat, aes(Estimate, fill =Continent, color =Continent)) +
geom_histogram(position = "stack") +
facet_wrap(.~Bioclim_var, scales = "free") +
theme_classic() + Color_Continent + Fill_Continent
Naturally, many of the variables investigated above are highly correlated. It is intuitive for example that the minimum temperature of the coldest month would be correlated to the average temperature of the coldest quarter! Here, I visualize these correlations.
#Correlogram
Ccor = cor(climate_per_coordinates[, c(3:ncol(climate_per_coordinates))])
corrplot(Ccor, type="lower", order="hclust",
col = mycolorsCorrel,
tl.col="black", tl.srt=45, tl.cex = 0.7)
#Simple heatmap
heatmap(x = Ccor, col = mycolorsCorrel, symm = TRUE)
Ccor[abs(Ccor) <= 0.75] <-0
heatmap(x = Ccor, col = mycolorsCorrel, symm = TRUE)
tidy_cors <- climate_per_coordinates %>%
dplyr::select(everything(), -contains("uarter"), -Y, -X) %>%
correlate() %>%
stretch()
graph_cors <- tidy_cors %>%
#filter(abs(r) > .2) %>%
graph_from_data_frame(directed = FALSE)
ggraph(graph_cors) +
geom_edge_link(aes(edge_alpha = .5, color = r)) +
guides(edge_alpha = "none", edge_width = "none") +
scale_edge_colour_gradientn(limits = c(-1, 1), colors = c("#0f8b8d", "white", "#a8201a")) +
#geom_node_point(color = "black", size = 3) +
geom_node_text(aes(label = name), size = 2) +
theme_graph() +
labs(title = "Correlations between bioclimatic variables")
#
tidy_cors <- climate_per_coordinates %>%
dplyr::select( -Y, -X) %>%
correlate() %>%
stretch()
graph_cors <- tidy_cors %>%
#filter(abs(r) > .75) %>%
mutate(r = ifelse(abs(r) <= 0.75, 0, r)) %>%
graph_from_data_frame(directed = FALSE)
ggraph(graph_cors) +
geom_edge_link(aes(edge_alpha = .5, color = r)) +
guides(edge_alpha = "none", edge_width = "none") +
scale_edge_colour_gradientn(limits = c(-1, 1), colors = c("#0f8b8d", "white", "#a8201a")) +
#geom_node_point(color = "black", size = 3) +
geom_node_text(aes(label = name), size = 2) +
theme_graph() +
labs(title = "Correlations between bioclimatic variables")
#Create standardized values for the environment variables
climate_per_coord_standard = climate_per_coordinates %>%
pivot_longer(cols = -c(X, Y), names_to = "Variable_climate", values_to = "Value") %>%
group_by(Variable_climate) %>%
mutate(Standard_value = scale(Value)) %>%
dplyr::select(-Value) %>%
pivot_wider(names_from = Variable_climate, values_from = Standard_value)
#The fam file should be same for all core chromosomes, hopefully. So I only have to create one to get the file format including the phenotypes. This can then be used for all chromosomes.
temp2 = left_join(read_tsv(Zt_list, col_names = "ID_file") %>% mutate(ID2 = ID_file),
Zt_meta %>% dplyr::select(ID_file, Latitude, Longitude)) %>%
bind_cols(., tibble(X1 = rep(0, nrow(Zt_meta)), X2 = rep(0, nrow(Zt_meta)), X3 = rep(0, nrow(Zt_meta)))) %>%
left_join(., climate_per_coord_standard,
by = c("Latitude" = "Y", "Longitude" = "X"))
dplyr::select(temp2, -Latitude, -Longitude) %>%
write_delim(.,
"~/Documents/Postdoc_Bruce/Projects/WW_project/5_GEA/Ztmeta.fam",
delim = " ", col_names = F)
left_join(read_tsv(Zt_list, col_names = "ID_file") %>% mutate(ID2 = ID_file),
Zt_meta %>% dplyr::select(ID_file, Latitude, Longitude)) %>%
bind_cols(., tibble(X1 = rep(0, nrow(Zt_meta)), X2 = rep(0, nrow(Zt_meta)), X3 = rep(0, nrow(Zt_meta)))) %>%
left_join(., climate_per_coordinates,
by = c("Latitude" = "Y", "Longitude" = "X")) %>%
dplyr::select(-Latitude, -Longitude) %>%
write_delim(.,
"~/Documents/Postdoc_Bruce/Projects/WW_project/5_GEA/Ztmeta_non_standard.fam",
delim = " ", col_names = F)
#Transfer on the cluster
#rsync -avP ~/Documents/Postdoc_Bruce/Projects/WW_project/5_GEA/Ztmeta.fam alice@130.125.25.244:/data2/alice/WW_project/5_GEA/Phenotypes_with_climatic_variables.fam
#To run gemma on the cluster: conda activate env0
#Commands are then in the format gemma -h
First, I read the results that I got from running GEMMA with a LOCO strategy (i.e., leave-one-chromosome-out).
results_GEMMA = list()
for (i in c(1:length(Bioclim_var))) {
temp = list.files(path = GEA_dir, pattern = paste0("*_for_phenotype_", i,".assoc.txt")) %>%
map_df(~read_tsv(file = paste0(GEA_dir, .), col_types = c("dcddccdddddd"))) %>%
unite(col = SNP, chr, ps, sep = "_", remove = F) %>%
dplyr::select(SNP, CHR = chr, BP = ps, P = p_wald, zscore = logl_H1) %>%
mutate(Phenotype = Bioclim_var[i], Nb = i)
results_GEMMA[[Bioclim_var[i]]] = temp
}
results_GEMMA = bind_rows(results_GEMMA)
core_temperature_variable = c("Max Temperature of Warmest Month", "Min Temperature of Coldest Month")
core_rain_variable = c("Precipitation of Wettest Month", "Precipitation of Driest Month")
#Genomic Inflation Factor
kable(results_GEMMA %>%
group_by(Phenotype) %>%
summarize(Genomic_Inflation_Factor = median(qchisq(1-P,1))/qchisq(0.5,1)) %>%
arrange(Genomic_Inflation_Factor))
| Phenotype | Genomic_Inflation_Factor |
|---|---|
| Precipitation of Warmest Quarter | 1.061468 |
| Mean Temperature of Wettest Quarter | 1.065765 |
| Precipitation of Driest Month | 1.080538 |
| Precipitation Seasonality (Coefficient of Variation) | 1.086267 |
| Mean Temperature of Driest Quarter | 1.086815 |
| Mean Diurnal Range | 1.089768 |
| Isothermality (BIO2/BIO7) (×100) | 1.101098 |
| Max Temperature of Warmest Month | 1.106971 |
| Annual Precipitation | 1.109024 |
| Precipitation of Coldest Quarter | 1.109791 |
| Precipitation of Driest Quarter | 1.110470 |
| Mean Temperature of Warmest Quarter | 1.113279 |
| Temperature Seasonality (standard deviation ×100) | 1.127099 |
| Min Temperature of Coldest Month | 1.127293 |
| Mean Temperature of Coldest Quarter | 1.130714 |
| Precipitation of Wettest Quarter | 1.131847 |
| Temperature Annual Range (BIO5-BIO6) | 1.142874 |
| Precipitation of Wettest Month | 1.146354 |
| Annual Mean Temperature | 1.149803 |
#QQ-plots
par(mfrow=c(3,3))
for (i in c(1:length(Bioclim_var))){
temp = results_GEMMA %>% filter(Phenotype == Bioclim_var[i]) %>% dplyr::sample_frac(size = .10) %>% dplyr::select(P) %>% pull()
GWASTools::qqPlot(temp, thinThreshold = 2)
}
To determine which variants are significant, we use the Bonferroni correction method, i.e. we divide the traditional threshold of significance of 0.05 by the number of variants that we tested. Based on this corrected threshold, we can then identify variants significantly associated with each environmental condition that we tested.
head(results_GEMMA)
## # A tibble: 6 x 7
## SNP CHR BP P zscore Phenotype Nb
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <int>
## 1 1_110857 1 110857 0.612 -981. Annual Mean Temperature 1
## 2 1_111115 1 111115 0.0460 -979. Annual Mean Temperature 1
## 3 1_111124 1 111124 0.594 -980. Annual Mean Temperature 1
## 4 1_111127 1 111127 0.599 -980. Annual Mean Temperature 1
## 5 1_111168 1 111168 0.555 -980. Annual Mean Temperature 1
## 6 1_111283 1 111283 0.794 -981. Annual Mean Temperature 1
Bonferroni_threshold = results_GEMMA %>% dplyr::count(Nb) %>%
mutate(Bonferroni_threshold = 0.05/n) %>% summarize(average = mean(Bonferroni_threshold)) %>% pull()
kable(results_GEMMA %>%
filter(P <= Bonferroni_threshold) %>%
dplyr::count(Phenotype) %>%
arrange(n))
| Phenotype | n |
|---|---|
| Precipitation of Driest Month | 36 |
| Precipitation of Driest Quarter | 38 |
| Mean Temperature of Driest Quarter | 41 |
| Mean Temperature of Wettest Quarter | 58 |
| Mean Temperature of Warmest Quarter | 60 |
| Precipitation of Wettest Month | 66 |
| Mean Diurnal Range | 81 |
| Annual Precipitation | 82 |
| Max Temperature of Warmest Month | 85 |
| Precipitation of Wettest Quarter | 98 |
| Precipitation of Warmest Quarter | 136 |
| Temperature Seasonality (standard deviation ×100) | 181 |
| Precipitation of Coldest Quarter | 194 |
| Temperature Annual Range (BIO5-BIO6) | 253 |
| Annual Mean Temperature | 263 |
| Mean Temperature of Coldest Quarter | 344 |
| Isothermality (BIO2/BIO7) (×100) | 353 |
| Precipitation Seasonality (Coefficient of Variation) | 399 |
| Min Temperature of Coldest Month | 536 |
significant = results_GEMMA %>%
filter(P <= Bonferroni_threshold) %>% group_by(SNP, CHR, BP) %>%
mutate(Count = n()) %>% mutate(SNP_type = "significant")%>%
ungroup()
#temp = significant %>% ungroup() %>% pivot_wider(values_fill = 1, values_from = P, names_from = Phenotype) %>%
# dplyr::select(-CHR, -BP, - zscore, -Nb)
significant %>%
dplyr::select(CHR, BP) %>%
distinct() %>%
write_delim(paste0(GEA_dir, "Significant_GEMMA.tsv"), delim = "\t", col_names = F)
Identifying the variants which are significant is very useful already, but it is hard to work with so many “independent” positions. It also does not make sense conceptually, as nearby variants are probably picking up the exact same signal through linkage. So, here I gather variants together into “significant loci” if they are no bigger gaps than a certain threshold.
#Distance between two significant SNPs to include them in the same region
len_threshold = 20000L
#For each phenotype and for each chromosome
list_loci = list()
for (j in c(1:length(Bioclim_var))) {
temp2 = list()
for (i in c(1:13)){
#Pull vector of position for the right chromosome and phenotype
v = filter(significant, CHR == i) %>%
filter(Phenotype == Bioclim_var[j]) %>%
dplyr::select(CHR, BP) %>%
distinct() %>% pull(BP) %>% sort()
# Split the vectors into group based on the length threshold
temp = split(v, cumsum(c(TRUE, diff(v) >= len_threshold)))
#Single value gets the column name "value" instead of a number
if (length(temp) == 1) {
temp_tibble = as.tibble(sapply(temp, '[', seq(max(sapply(temp, length)))))
colnames(temp_tibble) <- 1
} else {
temp_tibble = as.tibble(sapply(temp, '[', seq(max(sapply(temp, length)))))
}
# Transform to store relevant info in tidy format
temp2[[i]] = temp_tibble %>%
pivot_longer(cols =everything(), names_to = "Locus_nb", values_to = "BP") %>%
filter(complete.cases(.)) %>%
mutate(CHR = i, Phenotype = Bioclim_var[j]) %>%
ungroup()
}
list_loci[[j]] = bind_rows(temp2)
}
#Adding loci info to the table
significant = bind_rows(list_loci) %>%
full_join(significant, .)
significant %>% dplyr::select(Locus_nb, Phenotype) %>%
distinct() %>%
dplyr::count(Phenotype)
## # A tibble: 19 x 2
## Phenotype n
## <chr> <int>
## 1 "Annual Mean Temperature" 18
## 2 "Annual Precipitation" 7
## 3 "Isothermality (BIO2/BIO7) (×100)" 26
## 4 "Max Temperature of Warmest Month" 11
## 5 "Mean Diurnal Range " 7
## 6 "Mean Temperature of Coldest Quarter" 18
## 7 "Mean Temperature of Driest Quarter" 6
## 8 "Mean Temperature of Warmest Quarter" 7
## 9 "Mean Temperature of Wettest Quarter" 5
## 10 "Min Temperature of Coldest Month" 17
## 11 "Precipitation of Coldest Quarter" 12
## 12 "Precipitation of Driest Month" 3
## 13 "Precipitation of Driest Quarter" 3
## 14 "Precipitation of Warmest Quarter" 11
## 15 "Precipitation of Wettest Month" 10
## 16 "Precipitation of Wettest Quarter" 7
## 17 "Precipitation Seasonality (Coefficient of Variation)" 23
## 18 "Temperature Annual Range (BIO5-BIO6)" 15
## 19 "Temperature Seasonality (standard deviation ×100)" 17
To identify potentially more important variants amongst the significantly associated variants, we can predict the effect of the mutation on the protein sequence of genes. For this, I use SNPeff.
#rsync -avP ../5_GEA/Significant_GEMMA.tsv alice@130.125.25.244:/data2/alice/WW_project/5_GEA/Significant_GEMMA.tsv
#../Software/vcftools_jydu/src/cpp/vcftools --gzvcf /data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.ann.vcf.gz --positions /data2/alice/WW_project/5_GEA/Significant_GEMMA.tsv --out /data2/alice/WW_project/5_GEA/Significant_GEMMA --recode --recode-INFO-all
#../Software/bcftools-1.10.2/bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%INFO/ANN\n' /data2/alice/WW_project/5_GEA/Significant_GEMMA.vcf > /data2/alice/WW_project/5_GEA/Significant_GEMMA.ann.tsv
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/5_GEA/Significant_GEMMA.ann.tsv ../5_GEA/
#grep "#CHROM" /data2/alice/WW_project/5_GEA/Significant_GEMMA.vcf | cut -f 1,2,4,5,10- | sed 's/#//' > /data2/alice/WW_project/5_GEA/Significant_GEMMA.ann.tab
#../Software/bcftools-1.10.2/bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' /data2/alice/WW_project/5_GEA/Significant_GEMMA.vcf >> /data2/alice/WW_project/5_GEA/Significant_GEMMA.ann.tab
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/5_GEA/Significant_GEMMA.ann.tab ../5_GEA/
temp = read_tsv(paste0(GEA_dir, "Significant_GEMMA.ann.tsv"), col_names = c("CHR", "BP", "REF", "ALT", "ANN")) %>%
separate(ANN, sep = ",",
into = c("ANN1", "ANN2", "ANN3", "ANN4", "ANN5", "ANN6", "ANN7", "ANN8", "ANN9", "ANN10", "ANN11",
"ANN12", "ANN13", "ANN14", "ANN15", "ANN16", "ANN17"),
extra = "warn")
signif_ann = temp %>%
pivot_longer(cols = c(-CHR, -BP, -REF, -ALT), names_to = "ANN", values_to = "Annotation") %>%
filter(!is.na(Annotation)) %>%
separate(Annotation, into = c("ALT2", "Variant_type", "Effect_strength", "Gene", "Rest"), sep = "\\|", extra = "merge") %>%
filter(!str_detect(Gene, "-")) %>%
separate(Gene, into = c("temp", "Chrom", "Gene_nb"), remove = FALSE) %>%
dplyr::select(-ANN, -ALT2, -temp) %>%
mutate(Gene_nb = as.numeric(Gene_nb)) %>%
left_join(., read_tsv(effectors_annotation_file) %>%
filter(Sample == "Zt09") %>%
mutate(Gene = str_replace(Protein_ID, "_chr", "")))
signif_ann %>% dplyr::count(CHR, Gene_nb, Effect_strength) %>%
pivot_wider(names_from = Effect_strength, values_from = n, values_fill = 0)
## # A tibble: 1,435 x 6
## CHR Gene_nb MODIFIER MODERATE LOW HIGH
## <dbl> <dbl> <int> <int> <int> <int>
## 1 1 229 1 0 0 0
## 2 1 230 1 0 0 0
## 3 1 231 1 0 0 0
## 4 1 232 0 1 0 0
## 5 1 233 1 0 0 0
## 6 1 320 4 0 0 0
## 7 1 321 4 0 0 0
## 8 1 322 4 0 0 0
## 9 1 323 3 1 0 0
## 10 1 324 4 0 0 0
## # … with 1,425 more rows
pheno = "Max Temperature of Warmest Month"
temp = results_GEMMA %>% filter(Phenotype == pheno) %>%
left_join(., significant %>% dplyr::filter(Phenotype == pheno) ) %>%
mutate(SNP_type = ifelse(is.na(SNP_type), "Other", SNP_type)) %>%
mutate(logP = -log10(P)) %>%
left_join(., signif_ann) %>%
filter(SNP_type == "significant")
We have several variables that could be interpreted the same way. For example, we expect the amount of rain during the driest month and the driest quarter to give us similar information related to drought tolerance. It could be useful to identify useful SNPs to find the ones that are significantly associated to all variables of the same type. Let’s see if the p-values seem correlated between variables of the same types and what amount of significant variants are shared.
reshape_GEMMA_results <- function(phenotype_list) {
temp = results_GEMMA %>%
filter(Phenotype %in% phenotype_list) %>%
mutate(logP = -log10(P)) %>%
filter(logP > 2) %>%
mutate(Phenotype_code = ifelse(Phenotype == phenotype_list[1], "Pheno1", "Pheno2"))%>%
dplyr::select(SNP, Phenotype_code, logP)
temp = temp %>% pivot_wider(values_from = logP, names_from = Phenotype_code) %>%
mutate(Signif_max = ifelse(Pheno1 >= -log10(Bonferroni_threshold), "1", "0"),
Signif_mean = ifelse(Pheno2 >= -log10(Bonferroni_threshold), "1", "0")) %>%
unite(Significant, Signif_max, Signif_mean, sep = "", remove = F)
temp[complete.cases(temp),]
}
dot_plot_comparison <- function(reshaped_GEMMA_results, phenotype_list) {
ggplot(temp, aes(x = Pheno1, y = Pheno2, col = Significant)) +
geom_point() +
geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
linetype = "dashed", color = "grey20")+
geom_vline(aes(xintercept=-log10(Bonferroni_threshold)),
linetype = "dashed", color = "grey20") +
theme_classic() +
labs(x = phenotype_list[1], y = phenotype_list[1])
}
# Variables related to temperatures
# _________________________
warm_temp_var = c("Max Temperature of Warmest Month", "Mean Temperature of Warmest Quarter")
cold_temp_var = c("Min Temperature of Coldest Month", "Mean Temperature of Coldest Quarter")
#Significant for warm temperatures
temp = reshape_GEMMA_results(warm_temp_var)
table(temp[,"Significant"])
##
## 00 01 10 11
## 15613 30 55 30
p1 = dot_plot_comparison(temp, warm_temp_var)
#Significant for cold temperatures
temp = reshape_GEMMA_results(cold_temp_var)
table(temp[,"Significant"])
##
## 00 01 10 11
## 19057 29 221 315
p2 = dot_plot_comparison(temp, cold_temp_var)
cowplot::plot_grid(p1 +theme(legend.position = "none"), p2 + theme(legend.position = "none"))
# Variables related to rain
# _________________________
drought_var = c("Precipitation of Driest Month", "Precipitation of Driest Quarter")
rainy_var = c("Precipitation of Wettest Month", "Precipitation of Wettest Quarter")
core_rain_variable
## [1] "Precipitation of Wettest Month" "Precipitation of Driest Month"
#Significant for drought
temp = reshape_GEMMA_results(drought_var)
table(temp[,"Significant"])
##
## 00 01 10 11
## 14814 6 4 32
p1 = dot_plot_comparison(temp, drought_var)
#Significant for high precipitation
temp = reshape_GEMMA_results(rainy_var)
table(temp[,"Significant"])
##
## 00 01 10 11
## 16088 29 6 60
p2 = dot_plot_comparison(temp, rainy_var)
cowplot::plot_grid(p1 +theme(legend.position = "none"), p2 + theme(legend.position = "none"))
Instead of comparing the results based on a priori knowledge of the underlying wheather/climate patterns for each variables, it is possible tosimply look at the correlation between all pairs of variables. This yields a matrix that is similar to the one shown in the previous section but this time it takes into account the p-values in each test and not the values of the bioclimatic variables themselves.
# Keep only variants significant in at least one association test
temp = results_GEMMA %>%
ungroup() %>%
dplyr::select(-CHR, -BP, -Nb, -zscore) %>%
distinct() %>%
pivot_wider(names_from = Phenotype, values_from = P) %>%
dplyr::select(-SNP) %>%
filter(rowSums(. <= Bonferroni_threshold) >= 1)
#Correlation
Ccor = cor(temp)
#Correlogram
corrplot.mixed(Ccor, upper = "ellipse",
#col = mycolorsCorrel,
tl.col="black", tl.cex = 0.5, number.cex = .5)
#Simple heatmap
heatmap(x = Ccor, col = mycolorsCorrel, symm = TRUE)
It is also possible that some genes or some variants could be associated to different phenotypes. Here, we can use upset plots to visualise different comparisons:
# Per gene and per phenotype
gene_detected = inner_join(pheno_cat, significant) %>%
mutate(logP = -log10(P)) %>%
left_join(., signif_ann) %>%
mutate(Present = 1) %>%
ungroup() %>%
dplyr::select(Phenotype, Gene, Present) %>%
distinct() %>%
filter(!is.na(Gene)) %>%
pivot_wider(names_from = Phenotype, values_from = Present, values_fill = 0)
for_upset = as.data.frame(gene_detected[,-1])
rownames(for_upset) = pull(gene_detected[,1])
upset(for_upset, sets = names(for_upset),
order.by = "freq", mb.ratio = c(0.5, 0.5),
mainbar.y.label = "Gene numbers")
#Per variant and per phenotype
temp = inner_join(pheno_cat, significant)
temp2 = mutate(temp, Present = 1) %>%
ungroup() %>%
dplyr::select(Phenotype, SNP, Present) %>%
distinct() %>%
pivot_wider(names_from = Phenotype, values_from = Present, values_fill = 0)
for_upset = as.data.frame(temp2[,-1])
rownames(for_upset) = pull(temp2[,1])
upset(for_upset, sets = names(for_upset),
order.by = "freq", mb.ratio = c(0.5, 0.5),
mainbar.y.label = "Variant numbers")
#Category: found in both
n_count = temp %>% group_by(Category) %>%
dplyr::count(SNP)
signif_in_pairs = inner_join(temp, n_count %>% filter(n == 2)) %>%
mutate(Present = ifelse(n == 2, 1, 0)) %>%
dplyr::select(Category, SNP, Present) %>%
distinct() %>%
pivot_wider(names_from = Category, values_from = Present, values_fill = 0)
for_upset = as.data.frame(signif_in_pairs[,-1])
rownames(for_upset) = pull(signif_in_pairs[,1])
upset(for_upset, sets = names(for_upset), order.by = "freq", empty.intersections = "on" ,
mainbar.y.label = "Variant numbers found in all phenotypes")
#Per category: found in at least one
signif_in_pairs = inner_join(temp, n_count) %>%
mutate(Present = ifelse(n == 2, 1, 1)) %>%
dplyr::select(Category, SNP, Present) %>%
distinct() %>%
pivot_wider(names_from = Category, values_from = Present, values_fill = 0)
for_upset = as.data.frame(signif_in_pairs[,-1])
rownames(for_upset) = pull(signif_in_pairs[,1])
upset(for_upset, sets = names(for_upset),
order.by = "freq", empty.intersections = "on",
mainbar.y.label = "Variant numbers found in at least one phenotype" )
library(topGO)
pheno = "Temperature"
gene_detected = inner_join(pheno_table, significant) %>%
mutate(logP = -log10(P)) %>%
left_join(., signif_ann) %>%
filter(Effect_strength %in% c("HIGH", "MODERATE"))
geneNames = read_tsv(gff_file,
col_names = c("CHR", "X1", "mRNA", "Start", "Stop", "X2", "X3", "X4", "Annot")) %>%
filter(mRNA == "mRNA") %>%
separate(col = Annot, into = c("ID", "Parent", "Name"), sep = ";") %>%
mutate(Name = str_remove(Name, "Name=")) %>%
pull(Name)
myInterestingGenes <- gene_detected %>%
filter(Category == pheno) %>%
pull(Gene) %>% unique()
geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(geneList) <- geneNames
str(geneList)
## Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "names")= chr [1:11839] "Zt09_5_00296" "Zt09_5_00590" "Zt09_5_00072" "Zt09_5_00316" ...
geneID2GO = readMappings(paste0(data_dir, "GO_Zt09.map"))
GOdata <- new("topGOdata", ontology = "MF", allGenes = geneList,
annot = annFUN.gene2GO, gene2GO = geneID2GO)
resultFisher <- runTest(GOdata, algorithm = "classic", statistic = "fisher")
resultKS <- runTest(GOdata, algorithm = "classic", statistic = "ks")
resultKS.elim <- runTest(GOdata, algorithm = "elim", statistic = "ks")
allRes <- GenTable(GOdata, classicFisher = resultFisher,
classicKS = resultKS, elimKS = resultKS.elim,
orderBy = "elimKS", ranksOf = "classicFisher", topNodes = 10)
allRes
## GO.ID Term Annotated Significant
## 1 GO:0016409 palmitoyltransferase activity 10 0
## 2 GO:0003727 single-stranded RNA binding 19 0
## 3 GO:0019904 protein domain specific binding 14 0
## 4 GO:0070063 RNA polymerase binding 20 0
## 5 GO:0003746 translation elongation factor activity 7 0
## 6 GO:0001181 RNA polymerase I general transcription i... 3 0
## 7 GO:0051010 microtubule plus-end binding 10 0
## 8 GO:0015081 sodium ion transmembrane transporter act... 7 0
## 9 GO:0070181 small ribosomal subunit rRNA binding 6 0
## 10 GO:0043175 RNA polymerase core enzyme binding 19 0
## Expected Rank in classicFisher classicFisher classicKS elimKS
## 1 0.13 251 1 0.0041 0.0041
## 2 0.25 252 1 0.0051 0.0051
## 3 0.18 253 1 0.0070 0.0070
## 4 0.26 254 1 0.0078 0.0078
## 5 0.09 255 1 0.0085 0.0085
## 6 0.04 256 1 0.0089 0.0089
## 7 0.13 257 1 0.0117 0.0117
## 8 0.09 258 1 0.0124 0.0124
## 9 0.08 259 1 0.0135 0.0135
## 10 0.25 260 1 0.0150 0.0150
showSigOfNodes(GOdata, score(resultKS.elim), firstSigNodes = 5, useInfo = 'all')
## $dag
## A graphNEL graph with directed edges
## Number of Nodes = 18
## Number of Edges = 18
##
## $complete.dag
## [1] "A graph with 18 nodes."
Traditionally, GWAS results are represented as Manhattan plot. Let’s create some for the main variables we are interested in.
# Compute chromosome size and cumulative position of each chromosome
results_GEMMA_for_plot <- results_GEMMA %>%
group_by(CHR) %>%
dplyr::summarise(chr_len=max(BP)) %>%
mutate(tot=cumsum(chr_len)-chr_len) %>%
dplyr::select(-chr_len)
#EStimate middle of chromosome to center the labels
axisdf = results_GEMMA_for_plot %>%
inner_join( ., results_GEMMA %>% filter(Nb == 1)) %>%
arrange(CHR, BP) %>%
mutate(BPcum = BP+tot) %>%
group_by(CHR) %>%
dplyr::summarise(center=( max(BPcum) + min(BPcum) ) / 2 )
# Manhattan plot function
Manhattan_one_pheno <- function(pheno, color_duo) {
temp = results_GEMMA %>% filter(Phenotype == pheno) %>%
inner_join(., results_GEMMA_for_plot) %>%
left_join(., significant %>% dplyr::filter(Phenotype == pheno) ) %>%
mutate(SNP_type = ifelse(is.na(SNP_type), "Other", SNP_type)) %>%
mutate(logP = -log10(P)) %>%
filter(CHR <= 13) %>%
filter(logP > 2)
ggplot(temp, aes(x=BP, y=logP)) +
geom_point( aes(color=SNP_type), alpha = 0.8, size=1.3) +
scale_color_manual(values = color_duo) +
geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
linetype = "dashed", color = "grey20") +
theme_bw() +
theme( legend.position="none", panel.border = element_blank(),
panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(),
axis.title.x=element_blank(), axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.background = element_rect(colour="white", fill="white"),
plot.margin = unit(c(0, 0, 0, 0), "cm")) +
facet_grid(cols = vars(CHR), scales = "free_x")
}
# Manhattan plot function 2
Manhattan_faceted_pheno <- function(pheno_list, color_duo) {
results_GEMMA %>% filter(Phenotype %in% pheno_list) %>%
inner_join(., results_GEMMA_for_plot) %>%
arrange(CHR, BP) %>%
mutate(BPcum = BP+tot) %>%
mutate(logP = -log10(P))%>%
filter(logP > 2) %>%
ggplot(., aes(x=BPcum, y=logP)) +
geom_point( aes(color=as.factor(CHR)), alpha=0.8, size=1.3) +
scale_color_manual(values = rep(color_duo, 22 )) +
scale_x_continuous( label = axisdf$CHR, breaks= axisdf$center ) +
geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
linetype = "dashed", color = "grey20") +
theme_bw() +
theme(legend.position="none",
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()) +
facet_grid(rows = vars(Phenotype), scales = "free_y")
}
# Top SNPs
top_SNPs_density <- function(pheno, min_length_locus, custom_colors){
temp = significant %>%
filter(Phenotype == pheno) %>%
group_by(CHR, Locus_nb) %>%
mutate(Locus_length = 1 + max(BP) - min(BP)) %>%
filter(Locus_length >= min_length_locus) %>%
mutate(Rank = rank(P, ties.method = "first")) %>%
arrange(Rank) %>%
ungroup() %>%
filter(Rank == 1) %>%
dplyr::select(SNP, CHR, BP)
relevant_climate = climate_per_coordinates %>%
pivot_longer(-c(X,Y), names_to = "phenotype", values_to = "Bioclim") %>%
filter(phenotype == pheno) %>%
left_join(., Zt_meta %>% dplyr::select(ID_file, Latitude, Longitude),
by = c("X" = "Longitude", "Y" = "Latitude"))
temp = read_tsv(paste0(GEA_dir, "Significant_GEMMA.ann.tab"), na = ".") %>%
inner_join(temp, by = c("CHROM" = "CHR", "POS" = "BP")) %>%
pivot_longer(-c(CHROM, POS, REF, ALT, SNP),
names_to = "ID_file", values_to = "Allele") %>%
inner_join(., relevant_climate) %>%
filter(Allele != "NA")
temp %>%
ggplot(aes(Bioclim, fill = as.factor(Allele), col = as.factor(Allele))) +
geom_density(alpha = .4) +
facet_wrap(vars(CHROM, POS), scales = "free") +
theme_bw() +
scale_color_manual(values = custom_colors) +
scale_fill_manual(values = custom_colors)
#temp %>% dplyr::count(SNP, Allele) %>%
# pivot_wider(names_from = Allele, values_from = n)
}
# Gene/TE tracks plot
plot_gff <- function(gff_name, chromosome, min_coord, max_coord, genes_to_highlight, col = "black") {
gff = read_tsv(gff_name,
col_names = c("CHR", "X1", "mRNA", "Start", "Stop", "X2", "X3", "X4", "Annot")) %>%
separate(col = Annot, into = c("ID", "Parent", "Name"), sep = ";") %>%
mutate(ID = str_remove(ID, "ID=")) %>%
filter(CHR == chromosome) %>%
filter(Start >= min_coord) %>%
filter(Stop <= max_coord) %>%
mutate(y_value = rank(Start)) %>%
arrange(Start)
## If too many genes, organize them
if (nrow(gff) > 5) {
temp = ceiling(nrow(gff)/5)
gff = gff %>% mutate(y_value = rep(c(1:5), temp)[1:nrow(gff)])
}
## Make plot
c = gff %>%
ggplot() +
geom_segment(mapping = aes(x = Start, xend = Stop, y = y_value, yend = y_value), size = 2, col = col) +
theme_bw() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank(),
axis.title.x = element_blank(), axis.ticks.y = element_blank(),
panel.grid.minor.y = element_blank(), panel.grid.major.y = element_blank()) +
coord_cartesian(xlim = c(min_coord, max_coord), ylim = c(min(gff$y_value) - 0.5, max(gff$y_value) + 0.5))
## Highlight interesting genes
temp = match(genes_to_highlight, gff$ID)
if ( sum(!is.na(temp)) > 0 ){
c = c + geom_segment(data = filter(gff, ID == genes_to_highlight),
mapping = aes(x = Start, xend = Stop, y = y_value, yend = y_value),
size = 3, col = "blue")
}
## If there are a reasonable number of genes,
# I will also add their name on the plot
if (nrow(gff) <= 15) {
c = c + geom_text(aes(x = Stop + (max_coord - min_coord)*0.05 , y = y_value, label = ID), size = 2)
}
c
}
I want to have a main plot for temperature and for precipitation. First, I choose the maximum and minimum temperature.
# Manhattan plot with both max and min temperature with mirrored effect
p = Manhattan_one_pheno("Mean Temperature of Warmest Quarter", c("#F5B9B3", "#F28482")) +
labs(y = str_wrap("Mean Temperature of Warmest Quarter (-logP)", 25))
q = Manhattan_one_pheno("Min Temperature of Coldest Month", c("#C4E7FD", "#0889D9")) +
labs(y = str_wrap("Min Temperature of Coldest Month (-logP)", 25)) +
scale_y_reverse() + theme(strip.text.x = element_blank())
cowplot::plot_grid(p, q, nrow = 2)
ggsave(paste0(fig_dir, "GEA_Manhattan_temperature_main.pdf"), width = 18, height = 10, units = "cm")
top_SNPs_density("Mean Temperature of Warmest Quarter", 2000, c("#F5B9B3", "#F28482", "grey"))
top_SNPs_density("Min Temperature of Coldest Month", 2000, c("#C4E7FD", "#0889D9", "grey"))
Then, for precipitation, I choose the precipitation of the driest and of the wettest month.
# Manhattan plot for core precipitation
p = Manhattan_one_pheno("Precipitation of Driest Month", c("#FFD399", "#ff9f1c")) +
labs(y = str_wrap("Precipitation of Driest Month (-logP)", 25))
q = Manhattan_one_pheno("Precipitation of Wettest Month", c("#cbf3f0", "#2ec4b6")) +
scale_y_reverse() +
labs(y = str_wrap("Precipitation of Wettest Month (-logP)", 25))
cowplot::plot_grid(p, q, nrow = 2)
ggsave(paste0(fig_dir, "GEA_Manhattan_rain_main.pdf"), width = 18, height = 10, units = "cm")
top_SNPs_density("Precipitation of Driest Month", 2000, c("#FFD399", "#ff9f1c", "grey"))
top_SNPs_density("Precipitation of Wettest Month", 2000, c("#cbf3f0", "#2ec4b6", "grey"))
I also want to have a Manhattan plot for all possible environmental variable. I do try to classify them, grouping conceptually similar variables together.
#Yearly variables
Manhattan_faceted_pheno(Bioclim_var[c(2, 3, 7)], c("grey", "skyblue")) +
labs(title = "Ranges bioclimatic variables")
Manhattan_faceted_pheno(Bioclim_var[c(4, 15)], c("grey", "skyblue")) +
labs(title = "Seasonality bioclimatic variables")
Manhattan_faceted_pheno(Bioclim_var[c(1, 12)], c("grey", "skyblue")) +
labs(title = "Annual rain and temperature")
#Temperatures
Manhattan_faceted_pheno(warm_temp_var, c("grey", "skyblue")) +
labs(title = "High temperatures")
Manhattan_faceted_pheno(cold_temp_var, c("grey", "skyblue")) +
labs(title = "Cold temperatures")
#Rain
Manhattan_faceted_pheno(drought_var, c("grey", "skyblue")) +
labs(title = "Drought")
Manhattan_faceted_pheno(rainy_var, c("grey", "skyblue")) +
labs(title = "Rain")
#Mix rain and temperature
Manhattan_faceted_pheno(Bioclim_var[c(8, 9, 17, 18)], c("grey", "skyblue")) +
labs(title = "Mix rain and temperature")
i=10
#SNP_name = "3_444010"
#SNP_name = "7_1449518"
SNP_name = "10_460207"
temp2 = raster(paste0(data_dir,"Climatic_data/wc2.1_10m_bio/wc2.1_10m_bio_", i, ".tif"))
temp = as.data.frame(rasterToPoints(temp2, xy = TRUE))
colnames(temp) <- c("X", "Y", "Bioclim_var")
map1 = ggplot() +
geom_raster(data = temp ,
aes(x = X, y = Y,
fill = Bioclim_var)) +
theme_void() +
scale_fill_viridis("inferno") +
theme(panel.border = element_rect(fill=NA, colour = "grey",size = 0.5, linetype = 1))
#panel.background = element_rect(fill = "aliceblue"))
p1 = map1 + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70))
p2 = map1 + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80))
p3 = map1 + coord_cartesian(xlim=c(105, 175), ylim=c(-65, 10))
aus_map = cowplot::plot_grid(p3 + theme(legend.position = "None"), get_legend(p1), ncol = 2, rel_widths = c(1.1, 1))
ligne = cowplot::plot_grid(p1 + theme(legend.position = "None"), aus_map, ncol = 1, rel_heights = c(1, 0.7))
cowplot::plot_grid(p2 + theme(legend.position = "None"), ligne, ncol = 2, rel_widths = c(0.7, 1))
ggsave(paste0(fig_dir, "Map_BIO", i, ".pdf"), width = 18, height = 10, units = "cm")
allele = significant %>%
filter(Phenotype == Bioclim_var[i]) %>%
filter(SNP == SNP_name) %>%
dplyr::select(SNP, CHR, BP) %>%
inner_join(., read_tsv(paste0(GEA_dir, "Significant_GEMMA.ann.tab"), na = "."), by = c("CHR" = "CHROM", "BP" = "POS")) %>%
pivot_longer(-c(CHR, BP, REF, ALT, SNP),
names_to = "ID_file", values_to = "Allele") %>%
inner_join(., Zt_meta %>% dplyr::select(ID_file, Latitude, Longitude)) %>%
mutate(Latitude = round(Latitude/2)*2, Longitude = round(Longitude/2)*2)%>%
mutate(Allele = ifelse(Allele == 0, "REF", ifelse(Allele == 1, "ALT1", "ALT2"))) %>%
group_by(Allele, Longitude, Latitude) %>%
dplyr::count() %>%
pivot_wider(names_from = Allele, values_from = n, values_fill = 0) %>%
filter(!is.na(Longitude)) %>%
mutate(Radius = 2)
#Add if column alt2
if ("ALT2" %in% colnames(allele)) {
allele = allele
} else {
allele = allele %>% mutate(ALT2 = 0)
}
p1 = ggplot() + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) +
geom_scatterpie(data = allele, mapping = aes(x=Longitude, y=Latitude, r = Radius),
cols = c("REF", "ALT1", "ALT2"), color=NA, alpha=.8) +
theme(panel.background = element_rect(fill = "transparent"), # bg of the panel
plot.background = element_rect(fill = "transparent", color = NA), # bg of the plot
panel.grid.major = element_blank(), # get rid of major grid
panel.grid.minor = element_blank(), # get rid of minor grid
legend.background = element_rect(fill = "transparent"), # get rid of legend bg
legend.box.background = element_rect(fill = "transparent"))
p2 = ggplot() + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80)) +
geom_scatterpie(data = allele, mapping = aes(x=Longitude, y=Latitude, r = Radius*2),
cols = c("REF", "ALT1", "ALT2"), color=NA, alpha=.8)+
theme(panel.background = element_rect(fill = "transparent"), # bg of the panel
plot.background = element_rect(fill = "transparent", color = NA), # bg of the plot
panel.grid.major = element_blank(), # get rid of major grid
panel.grid.minor = element_blank(), # get rid of minor grid
legend.background = element_rect(fill = "transparent"), # get rid of legend bg
legend.box.background = element_rect(fill = "transparent"))
p3 = ggplot() + coord_cartesian(xlim=c(105, 175), ylim=c(-65, 10)) +
geom_scatterpie(data = allele, mapping = aes(x=Longitude, y=Latitude, r = Radius*2),
cols = c("REF", "ALT1", "ALT2"), color=NA, alpha=.8)+
theme(panel.background = element_rect(fill = "transparent"), # bg of the panel
plot.background = element_rect(fill = "transparent", color = NA), # bg of the plot
panel.grid.major = element_blank(), # get rid of major grid
panel.grid.minor = element_blank(), # get rid of minor grid
legend.background = element_rect(fill = "transparent"), # get rid of legend bg
legend.box.background = element_rect(fill = "transparent"))
#Plotting all the maps together!
aus_map = cowplot::plot_grid(p3 + theme(legend.position = "None"), get_legend(p1), ncol = 2, rel_widths = c(1.1, 1))
ligne = cowplot::plot_grid(p1 + theme(legend.position = "None"), aus_map, ncol = 1, rel_heights = c(1, 0.7))
cowplot::plot_grid(p2 + theme(legend.position = "None"), ligne, ncol = 2, rel_widths = c(0.7, 1))
ggsave(paste0(fig_dir, "Allele_", SNP_name,"_BIO", i, ".pdf"), width = 18, height = 10, units = "cm")
#Set parameters for the plot
chromosome = 7
min_coord = 1094086
max_coord = 1892455
#pheno = "Precipitation of Driest Month"
pheno = "Precipitation of Warmest Quarter"
genes_to_highlight = c("Zt09_model_5_00004")
TE_gff = "/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Cecile_IPO323_refTEs_match.gff"
gene_gff = "/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.mRNA.gff3"
# Prepare the gene section of the plot
c = plot_gff(TE_gff, chromosome, min_coord, max_coord, genes_to_highlight, "grey50")
d = plot_gff(gene_gff, chromosome, min_coord, max_coord, genes_to_highlight)
# Prepare the GWAS section of the plot for the first phenotype
b = results_GEMMA %>%
filter(Phenotype == pheno) %>%
filter(CHR == chromosome) %>%
filter(BP >= min_coord) %>%
filter(BP <= max_coord) %>%
mutate(logP = -log10(P)) %>%
ggplot() +
geom_point(aes(x = BP, y = logP, col = logP)) +
geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
linetype = "dashed", color = "grey20") +
theme_bw() +
labs(title = paste0("GWAS: ", pheno),
subtitle = paste0("The region is on the chromosome ", chromosome,
" from ", min_coord, "bp to ", max_coord, "bp.")) +
theme(axis.title.x = element_blank())
cowplot::plot_grid(b, c, d, ncol = 1, align = 'v', axis = 'lr', rel_heights = c(1, 0.3, 0.3))
This is based on the same scripts made for Fanny Hartmann’s paper on the pairs of populations: looking for alleles already known to be involved in resistance to fungicide.
genotypes_resistance = read_tsv(paste0(fung_dir, "genotypes_tidy_format.tab"),
col_names = c("temp", "ID_file", "Allele")) %>%
separate(temp, into = c("Gene", "AA_change"), sep = "\\.") %>%
dplyr::mutate(AA_REF = str_sub(AA_change, 1, 1)) %>%
dplyr::mutate(Allele_type = ifelse(AA_REF == Allele, "Origin", "Mutant")) %>%
left_join(Zt_meta)
genotypes_resistance %>%
dplyr::count(AA_change, Gene, Allele_type) %>%
ggplot(aes(x=n, y=AA_change, fill = Allele_type)) +
geom_barh(stat="identity") +
facet_grid(Gene ~ ., scales = "free_y", space = "free_y")
I suspect that there will be an effect of both geography and time on the frequency of these alleles and so I try to represent this here.
#CYP51
temp = genotypes_resistance %>%
filter(Gene == "CYP51") %>%
dplyr::count(AA_change, Continent,Sampling_Date, Allele_type) %>%
pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
mutate(prop_mutant = Mutant/(Mutant + Origin)) %>%
filter(!is.na(Sampling_Date)) %>%
filter(!is.na(Continent)) %>%
group_by(AA_change) %>%
dplyr::mutate(Somme = sum(Mutant)) %>%
filter(Somme > 0)
temp %>%
ggplot(aes(x=Sampling_Date, y=Continent, size = prop_mutant, fill = prop_mutant)) +
geom_point(shape=21, alpha =0.7) + theme_bw() +
facet_wrap(.~AA_change) +
scale_fill_gradient(low="white", high = "#16697a") +
labs(title = str_wrap(paste("Mutant proportion per known mutation",
"in the gene CYP51"), width = 35),
subtitle = str_wrap(paste("This gene can mutate and cause resistance to azole fungicides."), width = 65),
fill = "Proportion of mutants",
size = "Proportion of mutants",
x = "Years", y = "")
#Beta_tubulin
genotypes_resistance %>%
filter(Gene == "beta_tubulin") %>%
dplyr::count(AA_change, Continent,Sampling_Date, Allele_type) %>%
pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
mutate(Number_all = Mutant + Origin) %>%
mutate(prop_mutant = Mutant/Number_all) %>%
group_by(AA_change) %>%
dplyr::mutate(Somme = sum(Mutant)) %>%
filter(Somme > 0) %>%
ggplot(aes(x=Sampling_Date, y=Continent, size = prop_mutant, fill = prop_mutant)) +
geom_point(shape=21, alpha =0.7) + theme_bw() +
facet_wrap(.~AA_change) + scale_fill_gradient(low="white", high = "#16697a")+
labs(title = str_wrap(paste("Mutant proportion per known mutation",
"in the beta tubuline gene"), width = 35),
subtitle = str_wrap(paste("This gene can mutate and cause resistance",
" to benzimidazole fungicides."), width = 65),
fill = "Proportion of mutants",
size = "Proportion of mutants",
x = "Years", y = "")
# Mitochondrial_cytb
genotypes_resistance %>%
filter(Gene == "mitochondrial_cytb") %>%
dplyr::count(AA_change, Continent,Sampling_Date, Allele_type) %>%
pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
mutate(Number_all = Mutant + Origin) %>%
mutate(prop_mutant = Mutant/Number_all) %>%
group_by(AA_change) %>%
dplyr::mutate(Somme = sum(Mutant)) %>%
filter(Somme > 0) %>%
ggplot(aes(x=Sampling_Date, y=Continent, size = prop_mutant, fill = prop_mutant)) +
geom_point(shape=21, alpha =0.7) + theme_bw() +
facet_wrap(.~AA_change) + scale_fill_gradient(low="white", high = "#16697a") +
labs(title = str_wrap(paste("Mutant proportion per known mutation",
"in the mitochondrial gene cytb"), width = 35),
subtitle = str_wrap(paste("This gene can mutate and cause resistance ",
"to QoI, or Quinone outside inhibitors."), width = 65),
fill = "Proportion of mutants",
size = "Proportion of mutants",
x = "Years", y = "")
# SDH genes
genotypes_resistance %>%
filter(grepl("SDH", Gene)) %>%
unite(Label, Gene, AA_change, remove = T) %>%
dplyr::count(Label, Continent, Sampling_Date, Allele_type) %>%
pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
mutate(Number_all = Mutant + Origin) %>%
mutate(prop_mutant = Mutant/Number_all) %>%
group_by(Label) %>%
dplyr::mutate(Somme = sum(Mutant)) %>%
filter(Somme > 0) %>%
ggplot(aes(x=Sampling_Date, y=Continent, size = prop_mutant, fill = prop_mutant)) +
geom_point(shape=21, alpha =0.7) + theme_bw() +
facet_wrap(.~Label) + scale_fill_gradient(low="white", high = "#16697a")+
labs(title = str_wrap(paste("Mutant proportion per known mutation",
"in one of the SDH genes"), width = 35),
subtitle = str_wrap(paste("This gene can mutate and cause resistance to SDHI fungicides."), width = 65),
fill = "Proportion of mutants",
size = "Proportion of mutants",
x = "Years", y = "")
I use the alleles identified in the GWAS of Thierry Marcel and track them in the world-wide populations.
virulence_table = read_tsv(paste0(virulence_dir, "French_GWAS_virulence_alleles.tab")) %>%
separate(VIRULENCE_ALLELES, into = c("VIRULENCE_ALLELE1", "VIRULENCE_ALLELE2"), sep = ",")
french_GWAS_variants = read_tsv(paste0(virulence_dir, "Positions_GWAS.short.vcf"),
na = ".") %>%
pivot_longer(-c(`#CHROM`, POS, REF, ALT), names_to = "ID_file", values_to = "Allele_nb") %>%
separate(ALT, into = c("ALT1", "ALT2"), sep = ",") %>%
mutate(Allele_ID = ifelse(Allele_nb == 0, REF, ifelse(Allele_nb == 1, ALT1, ALT2))) %>%
left_join(., virulence_table) %>%
mutate(virulence = ifelse(Allele_ID == NON_VIRULENT_ALLELES, "Non_virulent",
ifelse(Allele_ID == VIRULENCE_ALLELE1 | Allele_ID == VIRULENCE_ALLELE2,
"Virulent", NA)))
french_GWAS_variants = inner_join(french_GWAS_variants, Zt_meta)
french_GWAS_variants %>%
tidyr::unite(position, `#CHROM`, POS) %>%
dplyr::count(position, Continent,Sampling_Date, virulence) %>%
pivot_wider(names_from = virulence, values_from = n, values_fill = list(n = 0)) %>%
mutate(Number_all = Virulent + Non_virulent) %>%
mutate(prop_virulent = Virulent/Number_all) %>%
ggplot(aes(x=Sampling_Date, y=Continent, size = prop_virulent, fill = prop_virulent)) +
geom_point(shape=21, alpha =0.7) + theme_bw() +
scale_fill_gradient(low="white", high = "#16697a")+
labs(title = str_wrap(paste("Virulence alleles",
""), width = 35),
subtitle = str_wrap(paste("These alleles were identified in a GWAS study",
" to increase either virulence or aggressiveness",
" on some wheat varieties."), width = 65),
fill = "Proportion of virulent",
size = "Proportion of virulent",
x = "Years", y = "") +
facet_wrap(vars(position))
Welll… Whatever…
#On the cluster: run blast
while read sample ; do sh Detect_gene_blast.sh ./Directories_new.sh /data2/alice/WW_project/7_Virulence/Avr_Stb6.fasta ${sample} Illumina /data2/alice/WW_project/7_Virulence/${sample}_Avr_Stb6.blast.tab; done < list_Third_batch_Dec_2018.txt
#On the cluster: Gather blast results
cat /data2/alice/WW_project/7_Virulence/*Avr_Stb6.blast.tab > /data2/alice/WW_project/7_Virulence/Overall_results_blast_Avr_Stb6.txt
#On the cluster: Gather blast results
while read p ;
do
sample=$(echo $p | cut -f 1 -d " " ) ;
chr=$(echo $p | cut -f 4 -d " ") ;
start=$(echo $p | cut -f 11 -d " ") ; end=$(echo $p | cut -f 12 -d " ") ;
echo $sample, $chr, $start, $end ;
order_start=$(echo -e "$start\n$end" | sort -n | head -n1);
order_end=$(echo -e "$start\n$end" | sort -nr | head -n1);
echo -e "${chr}\t${order_start}\t${order_end}" > temp.bed ;
~/Software/bedtools getfasta \
-fi /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/0_Spades/${sample}.fasta\
-bed temp.bed | \
sed "s/>/>${sample}:/" >> \
/data2/alice/WW_project/7_Virulence/Overall_results_blast_Avr_Stb6.fasta ;
done < /data2/alice/WW_project/7_Virulence/Overall_results_blast_Avr_Stb6.txt
#And on the website, because laziness
mafft --thread 8 --threadtb 5 --threadit 0 --reorder --adjustdirection --auto input > output
Here is the tree for the Avr_Stb6 gene. For reference, the allele carried by ST99CH_1A4 gives an avirulent phenotype, while the allele from ST99CH_1A5 is the virulent allele (from Zhong et al. 2017).
#grep ">" ../7_Virulence/Overall_results_blast_Avr_Stb6.fasta | sed 's/>//' > ../7_Virulence/Overall_results_blast_Avr_Stb6.list
temp2 = read_tsv(paste0(virulence_dir, "Overall_results_blast_Avr_Stb6.list"), col_names = "Fasta_header") %>%
separate(Fasta_header, into = c("ID_file", "chr", "coords"), sep=":", remove = F) %>%
mutate(contig2 = stringr::str_replace_all(Fasta_header, "[:.]", "_")) %>%
left_join(.,Zt_meta)
#Read tree in
#details from the mafft website about the tree
#Size = 640 sequences × 319 sites
#Method = Neighbor-Joining
#Model = Jukes-Cantor
#Bootstrap resampling = 0
#Alignment id = .200930173318269H5fPcXaMbnDOkoF7E0Y3ilsfnormal
tree_file = "Tree_Avr_Stb6.nwk"
tree = as_tibble(treeio::read.tree(paste0(virulence_dir, tree_file))) %>%
mutate(label_copy = label) %>%
separate(label_copy, into = c("nb", "contig"), extra = "merge", remove =F) %>%
dplyr::mutate(nb = as.integer(nb)) %>%
dplyr::mutate(label_OG = label) %>%
dplyr::mutate(contig2 = stringr::str_replace(contig, "R_", "")) %>%
left_join(., temp2, by = c("contig2")) %>%
dplyr::mutate(label_temp = ifelse(is.na(ID_file), ifelse(is.na(contig), label, contig), ID_file)) %>%
unite(col = "label_new", nb, label_temp, sep = "_")
tree2 = as_tibble(treeio::read.tree(paste0(virulence_dir, tree_file)))
temp = tree %>%
mutate(original_vir = ifelse(ID_file == "ST99CH_1A5", "ST99CH_1A5",
ifelse(ID_file == "ST99CH_1E4", "ST99CH_1E4", NA))) %>%
dplyr::select(label, node, label_new, Continent, Sampling_Date, Collection, original_vir) %>%
filter(!is.na(label))
#Find the outgroup!
root_node = tree2 %>%
filter(grepl("Zp13", label)) %>%
dplyr::select(node) %>%
pull()
rooted.tree <- ape::root(as.treedata(tree2), root_node)
p <- ggtree(rooted.tree, layout = "rectangular") %<+% (temp %>% dplyr::select (-label))
p2 <- p + geom_tippoint(aes(color = Continent), alpha = 0.7) +
theme(legend.position = "right") +
Color_Continent
#p2
p3 <- p + geom_tippoint(aes(color = original_vir)) +
theme(legend.position = "right") +
scale_color_manual(values = c(mycolorsCorrel[1], mycolorsCorrel[20]))
cowplot::plot_grid(p2, p3)
p4 <- p + geom_tippoint(aes(color = Sampling_Date), alpha = 0.7) +
theme(legend.position = "right")
cowplot::plot_grid(p4, p3)